In [22]:
import sys

import numpy
import numpy as np
import pandas as pd

from data_structure.dataset import Dataset
from measures.absolute_measures import *
from measures.statistical_tests import *



class Dataset(object):
    """Tabular dataset wrapper for fairness analysis.

    Columns whose names start with 'protected' are treated as protection-status
    attributes (integer-coded group membership, 0 = non-protected by convention);
    columns whose names start with 'target' are treated as target variables
    (classifier scores or binary classification results).
    """

    @property
    def data(self):
        # The underlying pandas DataFrame.
        return self.__data

    @property
    def protected_cols(self):
        # Names of all columns that encode a protection status.
        return self.__protected_cols

    @property
    def target_cols(self):
        # Names of all columns that contain target variables.
        return self.__target_cols

    def __init__(self, data):
        """
        @param data: either the filename of a CSV file to load, or an
                     already-constructed pandas DataFrame

        @raise TypeError:  if data is neither a filename nor a DataFrame
        @raise ValueError: if the dataset contains no protected or no target
                           columns, or a protected column holds non-integers
        """
        if isinstance(data, str):
            # expect data to be a filename, engine=python enables auto-detection of separator
            self.__data = pd.read_csv(data, header=0, sep=None, engine='python')
        elif isinstance(data, pd.DataFrame):
            self.__data = data
        else:
            # fail fast instead of leaving self.__data unset, which previously
            # surfaced later as a confusing AttributeError
            raise TypeError("data must be a CSV filename (str) or a pandas DataFrame")

        self.__protected_cols = [col for col in self.__data.columns if col.startswith('protected')]
        self.__target_cols = [col for col in self.__data.columns if col.startswith('target')]

        # check if dataset is well-formed
        if not self.__protected_cols:
            raise ValueError("The dataset should contain at least one column that describes a protection status")
        if not self.__target_cols:
            raise ValueError("The dataset should contain at least one column that describes a target variable")

        # check that protected attributes are indicated by integers
        # (bug fix: `integer` was an undefined name; accept Python ints as well
        # as any numpy integer dtype, which is what pandas actually stores)
        for protected_column in self.__protected_cols:
            protection_categories = self.__data[protected_column].unique()
            if not all(isinstance(item, (int, np.integer)) for item in protection_categories):
                raise ValueError("Protection status should be indicated by integers only")

    def normalize_column(self, column_name):
        """Apply mean normalization in place: (x - mean) / (max - min).

        NOTE(review): divides by zero if the column is constant (max == min) —
        confirm callers never pass such a column.
        """
        col = self.data[column_name].dropna()
        mean_col = col.mean()
        min_col = col.min()
        max_col = col.max()
        self.data[column_name] = self.data[column_name].apply(lambda x: (x - mean_col) / (max_col - min_col))

    def count_classification_and_category(self, target_col, protected_col, group, accepted):
        """
        counts the number of items that have the desired combination of protection status and
        classification result.
        Example: group=0 and accepted=0 returns the number of non-protected that were classified negative

        @param target_col:      name of the column in data that contains the classification results
        @param protected_col:   name of the column in data that contains the protection status
        @param group:           defines which protection status should be counted
        @param accepted:        defines which classification result should be counted

        @return: the number of occurrences of the given protection/classification combination;
        0 either if the given group does not exist or is not classified into the given class
        """
        # get all classification results for the given group
        classes_for_protected = self.get_all_targets_of_group(target_col, protected_col, group)
        # count those that match the given acceptance state
        return (classes_for_protected == accepted).sum()

    def get_all_targets_of_group(self, target_col, protected_col, group):
        """
        returns a vector with all target variables out of a given target column for a given group

        @param target_col:      name of the column in data that contains the classification results
        @param protected_col:   name of the column in data that contains the protection status
        @param group:           defines which group (grouped by protection status) should be considered

        @return: numpy array with target values
        """
        return self.data.loc[self.data[protected_col] == group, target_col].values

    def prob_positive_classification(self, target_col):
        """
        @return: portion of items that have been classified positively (target value == 1)
        """
        value_counts = self.data[target_col].value_counts()
        # default=0 covers the case that no item was classified positively
        pos_counts = value_counts.get(1, default=0)

        return pos_counts / len(self.data[target_col])

    def conditional_prob_for_group_category(self, target_col, protected_col, accepted):
        """
        calculates the conditional probability for each group (protected and favored) to be classified
        as positive (if accepted=1) or negative respectively (if accepted=0).
        Assumes that classification results are binary, either positive or negative

        @param target_col:      name of the column in data that contains the classification results
        @param protected_col:   name of the column in data that contains the protection status
        @param accepted:        int that says if the conditional probability of being accepted should be
                                calculated or the one of being rejected

        @return: a dictionary with protection status as key and conditional probability as value

        @raise ValueError: if target_col or protected_col is not a known column
        """
        if target_col not in self.target_cols:
            raise ValueError("given target column doesn't exist")

        if protected_col not in self.protected_cols:
            raise ValueError("given protected column doesn't exist")

        conditional_probs = {}
        unique, counts = np.unique(self.data[protected_col], return_counts=True)
        protected_group_counts = dict(zip(unique, counts))

        # calculate conditional probability of the desired outcome given each group category
        for group_category, member_count in protected_group_counts.items():
            conditional_probs[group_category] = \
                self.count_classification_and_category(target_col, protected_col, group_category, accepted) / member_count

        return conditional_probs


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-22-02b2bfbb19b9> in <module>()
      3 import pandas as pd
      4 
----> 5 from data_structure.dataset import Dataset
      6 from measures.absolute_measures import *
      7 from measures.statistical_tests import *

ModuleNotFoundError: No module named 'data_structure'

Absolute Measures

Mean Difference

$$d = E(y^+ \mid s^0) - E(y^+ \mid s^1)$$

takes a dataset with columns that contain target values (i.e. prediction scores of the model) as well as protection status variables and calculates the mean difference of the targets of each protected group to the non-protected group. For each target column, first the variables are ordered into a subset for each protected group and the mean is calculated. Then the values of the target column for the non-protected group are extracted and their mean is calculated.

Each protected mean of predictions is subtracted from the non-protected mean. dataset: data that contains all target and protected variables target_column: name of the column that contains the prediction values protected_column: name of the column that contains the protection status non-protected: the value within protected_column that describes the non-protected category zero on default

return: a python dataframe that contains the target as column name and the protection categories from protected_column as indices. Note that the non-protected category is excluded as it would contain only zeros anyway. The cells contain the values of the mean differences between the non-protected group and the particular protected one for that particular target variable. If the difference is greater than zero, the mean of the non-protected group was greater than the mean of the protected one; otherwise it was smaller.


In [16]:
def mean_difference(dataset, target_column, protected_column, non_protected=0):
    """
    Calculates the mean difference of the target values between the
    non-protected group and every protected group category.

    @param dataset:          Dataset object containing target and protected columns
    @param target_column:    name of the column that contains the prediction values
    @param protected_column: name of the column that contains the protection status
    @param non_protected:    value within protected_column that marks the
                             non-protected category (default 0)

    @return: DataFrame with the target column name as column and the protected
    categories as index; each cell holds mean(non-protected) - mean(protected).
    The non-protected category itself is excluded (it would be all zeros).

    @raise ValueError: if either column name is unknown to the dataset
    """
    if protected_column not in dataset.protected_cols:
        raise ValueError("given protected column name doesn't exist in dataset. Check spelling.")

    if target_column not in dataset.target_cols:
        raise ValueError("given target column name doesn't exist in dataset. Check spelling.")

    # get all protected attribute categories
    group_categories = dataset.data[protected_column].unique()

    # mean of target values for the non-protected group
    # (Series.mean skips NaN, matching the previous np.mean-on-Series behavior)
    nonprotected_mask = dataset.data[protected_column] == non_protected
    mean_nonprotected = dataset.data.loc[nonprotected_mask, target_column].mean()

    # one single-row frame per protected category, concatenated at the end
    # (bug fix: DataFrame.append was removed in pandas 2.0; use pd.concat)
    rows = []
    for category in group_categories:
        if category == non_protected:
            # skip non_protected category, has been handled above
            continue
        protected_mask = dataset.data[protected_column] == category
        mean_protected = dataset.data.loc[protected_mask, target_column].mean()
        rows.append(pd.DataFrame({target_column: [mean_nonprotected - mean_protected]},
                                 index=[category]))

    return pd.concat(rows) if rows else pd.DataFrame()

Normalized Difference

  • calculates the difference between the probability of being accepted given being a favored group member and the probability of being accepted given being a protected group member. This difference is normalized by the ratio of all accepted candidates by all favored candidates.
  • Non-Discrimination is indicated when no difference in these probabilities exist. Maximum discrimination is indicated when the result is 1 (or -1 respectively), i.e. the probability of being accepted as a favored is 1 whereas it is 0 for a protected group member.

  • Only works for the binary case -> one protected group, one non-protected group, classification result is either positive or negative

  • Assumes that in the dataset the favored group is labeled with protection status 0, protected group with 1

  • Assumes that in the dataset the positive outcome is labeled as 1, negative as 0

protected_col: name of the column that contains the protection status target_col: name of the column that contains the classifier results

return:

  • 0 if the probability of being accepted is equal for all groups
  • > 0 if the probability of being accepted is higher for the non-protected group
  • < 0 if the probability of being accepted is higher for the protected group

In [17]:
def normalized_difference(dataset, target_col, protected_col):
    """
    Difference between the acceptance probabilities of the favored group (0)
    and the protected group (1), normalized by d_max so that the result lies
    in [-1, 1]. Only valid for binary protection status and binary outcomes,
    with positive outcome labeled 1 and the favored group labeled 0.

    @param dataset:       Dataset object with the required columns
    @param target_col:    name of the column that contains the classifier results
    @param protected_col: name of the column that contains the protection status

    @return: 0 if acceptance probability is equal across groups; > 0 if it is
    higher for the non-protected group; < 0 if higher for the protected group;
    np.nan if the data is not binary.

    @raise ZeroDivisionError: if the normalization factor d_max is zero
    """
    unique_prot, counts_prot = np.unique(dataset.data[protected_col], return_counts=True)
    unique_targ = np.unique(dataset.data[target_col])

    if len(unique_prot) > 2 or len(unique_targ) > 2:
        print("This function is only applicable for binary problems. See function docs for details.")
        return np.nan

    protected_group_counts = dict(zip(unique_prot, counts_prot))
    conditional_probs = dataset.conditional_prob_for_group_category(target_col, protected_col, 1)

    total = len(dataset.data.index)
    prob_pos = (dataset.data[target_col] == 1).sum() / total
    prob_neg = (dataset.data[target_col] == 0).sum() / total
    prob_prot = protected_group_counts[1] / total
    prob_fav = protected_group_counts[0] / total

    # maximum achievable difference, used to normalize into [-1, 1]
    d_max = min(prob_pos / prob_fav, prob_neg / prob_prot)

    if d_max == 0:
        raise ZeroDivisionError("normalization factor d_max is zero; "
                                "all items share the same outcome for one group")

    return (conditional_probs[0] - conditional_probs[1]) / d_max

In [18]:
filename = 'demo.csv'

In [19]:
# Load the demo data and run a difference-of-means test between the groups.
# NOTE(review): t_test_ind is presumably provided by the measures.* star
# imports above — verify, the traceback shows those imports failed here.
dataset = Dataset(filename)
print('=========== difference of means test =============')
print(t_test_ind(dataset, 'target_score', 'protected_sex'))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-62639481eb86> in <module>()
----> 1 dataset = Dataset(filename)
      2 print('=========== difference of means test =============')
      3 print(t_test_ind(dataset, 'target_score', 'protected_sex'))

NameError: name 'Dataset' is not defined

In [ ]:
# Report mean differences; .T puts protection categories into columns for display.
print('\n=========== mean differences ==============')
print(mean_difference(dataset, 'target_score', 'protected_sex').T)

In [ ]:
# Report the normalized difference for the binary loan-approval outcome.
# (bug fix: the second print was indented, which is an IndentationError at top level)
print('\n=========== normalized differences ============')
print(normalized_difference(dataset, 'target_loan_approved', 'protected_sex'))

In [ ]:
# Report the impact ratio (impact_ratio comes from the measures.* star imports).
# (bug fix: the second print was indented, which is an IndentationError at top level)
print('\n=========== impact ratio ============')
print(impact_ratio(dataset, 'target_loan_approved', 'protected_sex'))

In [ ]:
# Report Fisher's exact test odds ratio (fisher_exact_two_groups comes from
# the measures.* star imports).
# (bug fix: the second print was indented, which is an IndentationError at top level)
print('\n=========== odds ratio ============')
print(fisher_exact_two_groups(dataset, 'target_loan_approved', 'protected_sex'))